Fan Wang
July 15 2020
| Project | Data description | Subjects | Source |
|---|---|---|---|
| open-DSFSI (Kaggle) | Epidemiology individual-level line lists in Africa | 10087 | Africa CDC and Ministries of Health |
| open-nCoV2019 (UW) | Epidemiology individual-level line lists around the world | 1083 | University of Washington Institute for Health Metrics and Evaluation |
| TCGA-COAD (GDC) | Clinical data for TCGA Colon Adenocarcinoma samples | 461 | NCI Genomic Data Commons (GDC) |
In this notebook, we obtain the clinical data for multiple projects from Chicagoland Pandemic Response Commons and NCI Genomic Data Commons (GDC) and visualize some of the clinical attributes. This notebook is for demonstration purposes only and not biologically meaningful.
%%capture
import requests
import json
import pandas as pd
import requests, json, fnmatch, os, os.path, sys, subprocess, glob, ntpath, copy, gen3
from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission
from functools import reduce
# visual libraries
! pip install upsetplot
! pip install joypy
import upsetplot
from upsetplot import UpSet
import warnings
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import joypy
import numpy as np
# notebook setting
warnings.filterwarnings("ignore")
sns.set(style="ticks", color_codes=True)
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
def get_gdc_clinical(project: str):
gdc_url = "https://api.gdc.cancer.gov/cases"
headers = {"Content-Type": "application/json"}
fields = "case_id,submitter_id,project.project_id,project.name"
expand = "diagnoses,demographic,family_histories"
params = {
"filters": {
"op": "in",
"content": {"field": "project.project_id", "value": [project]},
},
"format": "TSV",
"size": "10000",
"fields": fields,
"expand": expand,
}
response = requests.post(
gdc_url, data=json.dumps(params), headers=headers, stream=True
)
return response.content
# TCGA-OV
project = "TCGA-COAD"
ofil = "{0}.clinical.tsv".format(project)
with open(ofil, "wt") as o:
o.write(get_gdc_clinical(project).decode("utf-8"))
gdc_df = pd.read_csv(ofil, sep="\t")
Age at Diagnosis and Year to Death by race in TCGA-COAD¶df = gdc_df[
["demographic.race", "diagnoses.0.age_at_diagnosis", "demographic.days_to_death"]
]
df["Age at diagnosis"] = df["diagnoses.0.age_at_diagnosis"] / 365
df["Year to death"] = df["demographic.days_to_death"] / 365
df["Race"] = df["demographic.race"]
df = df.dropna(subset=["Age at diagnosis"])
plt.figure(figsize=(8, 6), dpi=200)
fig, axes = joypy.joyplot(
df.loc[(df.Race != "native hawaiian or other pacific islander") & (df.Race != "asian") & (df.Race != "american indian or alaska native")],
column=["Age at diagnosis", "Year to death"],
by="Race",
ylim="own",
figsize=(8, 6),
overlap=1,
legend=True,
)
plt.title("Distribution of Age at Diagnosis and Year to Death by Race", fontsize=15)
plt.show()
Tumor Stage and Vital Status in TCGA-COAD¶plt.figure(figsize=(10, 8), dpi=200)
vital_figo_df = (
gdc_df.loc[:, ["case_id", "demographic.vital_status", "diagnoses.0.tumor_stage"]]
.groupby(["demographic.vital_status", "diagnoses.0.tumor_stage"])
.size()
.reset_index(name="counts")
)
vital_figo_df = vital_figo_df.loc[
vital_figo_df["demographic.vital_status"] != "Not Reported"
]
table = pd.pivot_table(
vital_figo_df,
columns=["demographic.vital_status"],
values=["counts"],
index="diagnoses.0.tumor_stage",
aggfunc=np.sum,
fill_value=0,
)
table["index"] = range(14)
g = sns.catplot(
x="counts",
y="diagnoses.0.tumor_stage",
data=vital_figo_df,
hue="demographic.vital_status",
kind="bar",
height=6,
aspect=1.5,
legend=False,
)
for row in table.itertuples():
g.ax.text(
row[1] + 6,
row[3] - 0.1,
"n=" + str(row[1]),
horizontalalignment="center",
size=9,
color="black",
)
for row in table.itertuples():
g.ax.text(
row[2] + 3,
row[3] + 0.28,
"n=" + str(row[2]),
horizontalalignment="center",
size=9,
color="black",
)
plt.xlabel("Number of Samples")
plt.ylabel("Tumor Stage")
plt.yticks(fontsize=10)
plt.title("Sample Size by Tumor Stage and Vital Status", fontsize=15)
plt.legend()
plt.show()
To extract the data we need, we simply export the demographic and observation nodes from the Chicagoland Pandemic Response Commons.
api = "https://chicagoland.pandemicresponsecommons.org/"
creds = "credentials_covid.json"
auth = Gen3Auth(api, refresh_file=creds)
sub = Gen3Submission(api, auth)
def get_node_tsvs(node, projects=None, overwrite=False):
# Get a TSV of the node(s) specified for each project specified
if not isinstance(node, str): # Create folder on VM for downloaded files
mydir = "downloaded_tsvs"
else:
mydir = str(node + "_tsvs")
if not os.path.exists(mydir):
os.makedirs(mydir)
if projects is None: # if no projects specified, get node for all projects
project_ids = list(
json_normalize(
sub.query("""{project (first:0){project_id}}""")["data"]["project"]
)["project_id"]
)
elif isinstance(projects, str):
projects = [projects]
dfs = []
df_len = 0
for project in projects:
filename = str(mydir + "/" + project + "_" + node + ".tsv")
if (os.path.isfile(filename)) and (overwrite is False):
print("File previously downloaded.")
else:
prog, proj = project.split("-", 1)
sub.export_node(prog, proj, node, "tsv", filename)
df1 = pd.read_csv(filename, sep="\t", header=0)
dfs.append(df1)
df_len += len(df1)
print(filename + " has " + str(len(df1)) + " records.")
all_data = pd.concat(dfs, ignore_index=True)
print("length of all dfs: " + str(df_len))
nodefile = str("master_" + node + ".tsv")
all_data.to_csv(str(mydir + "/" + nodefile), sep="\t")
print(
"Master node TSV with "
+ str(len(all_data))
+ " total records written to "
+ nodefile
+ "."
)
return all_data
# Get subject node tsvs
demographic_DSFSI = get_node_tsvs("demographic", "open-DSFSI")
demographic_nCOV2019 = get_node_tsvs("demographic", "open-nCoV2019")
observation_nCOV2019 = get_node_tsvs("observation", "open-nCoV2019")
demographic_nCOV2019 = demographic_nCOV2019[["age", "gender", "subjects.submitter_id"]]
observation_nCOV2019 = observation_nCOV2019[["symptoms", "subjects.submitter_id"]]
nCOV2019 = pd.merge(
demographic_nCOV2019, observation_nCOV2019, on="subjects.submitter_id", how="inner"
)
ncov2019 = nCOV2019[["age", "symptoms"]].dropna(subset=["symptoms"])
def harmonize_symptoms(dataframe):
dataframe["symptom"] = dataframe["symptoms"].str.split(",")
for symptom in set.union(*dataframe.symptom.apply(set)):
dataframe[symptom] = dataframe.apply(
lambda _: int(symptom in _.symptom), axis=1
)
dataframe = dataframe.drop(["symptoms", "symptom"], axis=1)
symptoms = dataframe.columns.tolist()
symptoms.remove("age")
for symptom in symptoms:
dataframe = dataframe.replace({symptom: {0: False, 1: True}})
return dataframe
ncov2019_symptom = harmonize_symptoms(ncov2019)
symptoms = ncov2019_symptom.columns.tolist()
symptoms.remove("age")
ncov2019_symptom = ncov2019_symptom.set_index(symptoms)
upset = UpSet(
ncov2019_symptom, subset_size="count", intersection_plot_elements=10, element_size=16
)
upset.add_catplot(value="age", kind="boxen", elements=8)
upset.plot()
plt.show()
gdc_gender_percent = gdc_df[
["diagnoses.0.age_at_diagnosis", "demographic.gender"]
].dropna(subset=["diagnoses.0.age_at_diagnosis", "demographic.gender"])
gdc_gender_percent["age"] = gdc_gender_percent["diagnoses.0.age_at_diagnosis"] / 365
def define_age_decade(dataset):
dataset.loc[(dataset["age"] < 10), "age_decade"] = "0s"
dataset.loc[(dataset["age"] < 20) & (dataset["age"] >= 10), "age_decade"] = "10s"
dataset.loc[(dataset["age"] < 30) & (dataset["age"] >= 20), "age_decade"] = "20s"
dataset.loc[(dataset["age"] < 40) & (dataset["age"] >= 30), "age_decade"] = "30s"
dataset.loc[(dataset["age"] < 50) & (dataset["age"] >= 40), "age_decade"] = "40s"
dataset.loc[(dataset["age"] < 60) & (dataset["age"] >= 50), "age_decade"] = "50s"
dataset.loc[(dataset["age"] < 70) & (dataset["age"] >= 60), "age_decade"] = "60s"
dataset.loc[(dataset["age"] < 80) & (dataset["age"] >= 70), "age_decade"] = "70s"
dataset.loc[(dataset["age"] < 90) & (dataset["age"] >= 80), "age_decade"] = "80s"
dataset.loc[(dataset["age"] < 100) & (dataset["age"] >= 90), "age_decade"] = "90s"
dataset.loc[(dataset["age"] >= 100), "age_decade"] = "100s"
return dataset[dataset.age_decade.notna()]
Calculate fatality rates
The fatality rate is the number of confirmed deaths divided by the number of confirmed cases.
# Calculate male percentage for TCGA-COAD
gdc_gender_percent = define_age_decade(gdc_gender_percent[["age", "demographic.gender"]])
gdc_male_percent = (
gdc_gender_percent.loc[gdc_gender_percent["demographic.gender"] == "male"]
.groupby(["age_decade"])
.count()
/ gdc_gender_percent.shape[0]
)
gdc_male_percent = gdc_male_percent.rename(
columns={"age": "TCGA: Colon Adenocarcinoma"}
)
gdc_male_percent = gdc_male_percent.drop(["demographic.gender"], axis=1)
# Calculate male percentage for DSFSI
demographic_DSFSI = demographic_DSFSI.dropna(subset=["age", "gender"])
DSFSI_gender_percent = define_age_decade(demographic_DSFSI[["age", "gender"]])
DSFSI_male_percent = (
DSFSI_gender_percent.loc[DSFSI_gender_percent["gender"] == "Male"]
.groupby(["age_decade"])
.count()
/ DSFSI_gender_percent.shape[0]
)
DSFSI_male_percent = DSFSI_male_percent.rename(
columns={"age": "COVID-19 in Africa"}
)
DSFSI_male_percent = DSFSI_male_percent.drop(["gender"], axis=1)
# Calculate male percentage for nCoV2019
demographic_nCOV2019 = demographic_nCOV2019.dropna(subset=["age", "gender"])
nCOV2019_gender_percent = define_age_decade(demographic_nCOV2019[["age", "gender"]])
nCOV2019_male_percent = (
nCOV2019_gender_percent.loc[nCOV2019_gender_percent["gender"] == "Male"]
.groupby(["age_decade"])
.count()
/ nCOV2019_gender_percent.shape[0]
)
nCOV2019_male_percent = nCOV2019_male_percent.rename(
columns={"age": "COVID-19 in nCOV2019"}
)
nCOV2019_male_percent = nCOV2019_male_percent.drop(["gender"], axis=1)
data_frames = [
DSFSI_male_percent,
nCOV2019_male_percent,
gdc_male_percent
]
fatality_all = reduce(
lambda left, right: pd.merge(left, right, on=["age_decade"], how="outer"),
data_frames,
)
age_decade = fatality_all.index
fig = go.Figure(
data=[
go.Bar(name="COVID-19 in Africa", x=age_decade, y=fatality_all["COVID-19 in Africa"]),
go.Bar(name="COVID-19 in nCOV2019", x=age_decade, y=fatality_all["COVID-19 in nCOV2019"]),
go.Bar(name="TCGA: Colon Adenocarcinoma (GDC)", x=age_decade, y=fatality_all['TCGA: Colon Adenocarcinoma']),
]
)
fig.update_layout(
yaxis=dict(tickformat=".0%"),
barmode="group",
title="Male Percentage by Age in COVID-19 and TCGA Colon Adenocarcinoma",
xaxis_title="Age",
yaxis_title="Male Percentage",
legend_title="Datasets",
font=dict(size=15),
)
fig.show()